In [ ]:
%run "../Functions/1. Game sessions.ipynb"
print("2. Google form analysis")
In [ ]:
# special user ids
# Hard-coded user ids, grouped by the version label given in the comments.
# NOTE: the "1.52.2" group below re-assigns every name defined in the "1.52"
# group, so only the "1.52.2" values remain bound once this cell has run.
# 1.52
userIdThatDidNotAnswer = '001c95c6-8207-43dc-a51b-adf0c6e005d7'
userId1AnswerEN = '00dbbdca-d86c-4bc9-803c-0602e0153f68'
userIdAnswersEN = '5977184a-1be2-4725-9b48-f2782dc03efb'
userId1ScoreEN = '6b5d392d-b737-49ef-99af-e8c445ff6379'
userIdScoresEN = '5ecf601d-4eac-433e-8056-3a5b9eda0555'
userId1AnswerFR = '2734a37d-4ba5-454f-bf85-1f7b767138f6'
userIdAnswersFR = '01e85778-2903-447b-bbab-dd750564ee2d'
userId1ScoreFR = '3d733347-0313-441a-b77c-3e4046042a53'
userIdScoresFR = '58d22690-8604-41cf-a5b7-d71fb3b9ad5b'
userIdAnswersENFR = 'a7936587-8b71-43b6-9c61-17b2c2b55de3'
# 1.52.2
# In this group each "Answer(s)" id and its matching "Score(s)" id are the same value.
userIdThatDidNotAnswer = '6919aa9a-f18e-4fc5-8435-c26b869ba571'
userIdAnswersFR = '0135e29b-678d-4188-a935-1d0bfec9450b'
userIdScoresFR = '0135e29b-678d-4188-a935-1d0bfec9450b'
userId1AnswerFR = '01cc303e-d7c1-4c84-8e17-182b410da343'
userId1ScoreFR = '01cc303e-d7c1-4c84-8e17-182b410da343'
userId1AnswerEN = '027fb5ca-c40a-4977-852a-e448538061f2'
userId1ScoreEN = '027fb5ca-c40a-4977-852a-e448538061f2'
userIdAnswersEN = '1e94b693-df8f-4ad0-9f02-4aac6929bdaa'
userIdScoresEN = '1e94b693-df8f-4ad0-9f02-4aac6929bdaa'
userIdAnswersENFR = '2ad10897-b143-45f4-9a78-60ee4bcecc80'
In [ ]:
# Name of the gform column that holds the user id.
# Previous value, kept for reference (a French form column header):
#localplayerguidkey = 'Ne pas modifier - identifiant anonyme prérempli'
localplayerguidkey = 'userId'
# Position of that column within gform.columns.
localplayerguidindex = gform.columns.get_loc(localplayerguidkey)
# Bare expression: displays the value as the notebook cell output.
localplayerguidindex
In [ ]:
# Column key of the first evaluation question, and its position in gform.columns.
# QGenotypePhenotype is not defined in this notebook section.
firstEvaluationQuestionKey = QGenotypePhenotype
firstEvaluationQuestionIndex = gform.columns.get_loc(firstEvaluationQuestionKey)
# Bare expression: displays the value as the notebook cell output.
firstEvaluationQuestionIndex
In [ ]:
# Column name prefixes: getAnswers names its columns <answers stem><gform row label>,
# and getCorrections adds matching <corrections stem><gform row label> columns.
answersColumnNameStem = "answers"
correctionsColumnNameStem = "corrections"
In [ ]:
def getUniqueUserCount(gfDF):
    """Return the number of distinct values in the user id column of gfDF."""
    userIdColumn = gfDF[localplayerguidkey]
    distinctUserCount = userIdColumn.nunique()
    return distinctUserCount
In [ ]:
def getAllResponders( _gfDF ):
    """Return the distinct values of the user id column of _gfDF."""
    return _gfDF[localplayerguidkey].unique()
def getRandomGFormGUID(_gfDF = None):
    """Return a randomly chosen responder id that satisfies isGUIDFormat.

    _gfDF: form dataframe to draw from; defaults to the global gform.

    Raises ValueError when _gfDF has no responder at all. As before, the
    function keeps drawing until isGUIDFormat accepts an id, so it does not
    terminate if no responder id has that format.
    """
    if _gfDF is None:
        _gfDF = gform
    # getAllResponders requires the dataframe; the previous argument-less call
    # raised a TypeError.
    _uniqueUsers = getAllResponders(_gfDF)
    _userCount = len(_uniqueUsers)
    if _userCount == 0:
        raise ValueError("no responder to pick a GUID from")
    _guid = '0'
    while (not isGUIDFormat(_guid)):
        _userIndex = randint(0,_userCount-1)
        _guid = _uniqueUsers[_userIndex]
    return _guid
def hasAnswered( userId, _gfDF ):
    """Tell whether userId appears in the user id column of _gfDF."""
    knownUserIds = _gfDF[localplayerguidkey].values
    return userId in knownUserIds
def getAnswers( userId, _gfDF ):
    """Return the surveys of userId, transposed: one column per survey.

    Columns are named answersColumnNameStem + <row label of the survey in
    _gfDF>; rows are the columns of _gfDF. When the user has no survey, a
    message is printed and the (column-less) transposed frame is returned.
    """
    userSurveys = _gfDF[_gfDF[localplayerguidkey] == userId]
    _columnAnswers = userSurveys.T
    if 0 == len(userSurveys):
        # user has never answered
        print("user " + str(userId) + " has never answered")
        return _columnAnswers
    _columnAnswers.columns = [
        answersColumnNameStem + str(surveyLabel)
        for surveyLabel in _columnAnswers.columns
    ]
    return _columnAnswers
In [ ]:
def resetTemporalities(_gfDF):
    """Set the QTemporality column of _gfDF to answerTemporalities[2] on every row (in place)."""
    resetValue = answerTemporalities[2]
    _gfDF[QTemporality] = resetValue
In [ ]:
#gform[QPlayed].unique()
In [ ]:
# answers that show that this survey was a pretest
# (the commented-out entries are QPlayed answers deliberately left out of each list)
alreadyPlayedPretestAnswers = [
    'No / not yet',
    # 'I just played for the first time',
    'I played it some time ago', # certainly an older version of the game
    # 'I played it multiple times recently',
    # 'I played recently on an other computer', # has to fill in profile questions again
    # 'I played it multiple times recently on this computer'
]
# QPlayed answers listed as posttest answers; not read by any function in this section.
alreadyPlayedPosttestAnswers = [
    # 'No / not yet',
    'I just played for the first time',
    # 'I played it some time ago',
    'I played it multiple times recently',
    'I played recently on an other computer',
    'I played it multiple times recently on this computer'
]
# based only on user answer
# Same text as one of the posttest answers above; not read by any function in this section.
APlayedButProfileAgain = 'I played recently on an other computer'
def setAnswerTemporalitiesSimple(_gfDF):
    """Set QTemporality from the user's own QPlayed answer (in place).

    Rows whose QPlayed answer is in alreadyPlayedPretestAnswers get
    answerTemporalities[0]; all other rows get answerTemporalities[1].
    Nothing happens unless QTemporality currently holds exactly one distinct
    value, which is how "not set yet" is detected.
    """
    notSetYet = (1 == len(_gfDF[QTemporality].unique()))
    if notSetYet:
        declaredPretest = _gfDF[QPlayed].isin(alreadyPlayedPretestAnswers)
        _gfDF[QTemporality] = declaredPretest.map({
            True: answerTemporalities[0],
            False: answerTemporalities[1],
        })
        # NOTE(review): the original indentation was lost; this print is assumed
        # to belong to the "not set yet" branch.
        print("temporalities set (user answer method)")
In [ ]:
# based only on first meaningful game event
def setAnswerTemporalities(_gfDF):
    """Set QTemporality by comparing each survey date to the user's first game event.

    Works in place on _gfDF, and only when QTemporality currently holds exactly
    one distinct value (the "not set yet" test). For each user at most one
    survey keeps answerTemporalities[0] (the latest one answered before the
    first event) and at most one keeps answerTemporalities[1] (the earliest one
    answered after it); the others are set to answerTemporalities[2].
    """
    _before, _after, _undefined = answerTemporalities[0], answerTemporalities[1], answerTemporalities[2]
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        # format : key = _userId, value = [_firstEventDate, None or row label of before, None or row label of after]
        # None (not 0) marks "no survey yet": 0 is a valid row label of _gfDF.
        temporalities = {}
        for _index in _gfDF.index:
            _userId = _gfDF.loc[_index,localplayerguidkey]
            if _userId in temporalities:
                _firstEventDate, beforeIndex, afterIndex = temporalities[_userId]
            else:
                _firstEventDate = getFirstEventDate(_userId)
                beforeIndex, afterIndex = None, None
            temporality = getTemporality(_gfDF.loc[_index,QTimestamp],_firstEventDate)
            if temporality == _before and beforeIndex is not None:
                # a pretest is already known: keep only the most recent one
                if _gfDF.loc[_index,QTimestamp] > _gfDF.loc[beforeIndex,QTimestamp]:
                    _gfDF.loc[beforeIndex,QTemporality] = _undefined
                else:
                    temporality = _undefined
            elif temporality == _after and afterIndex is not None:
                # a posttest is already known: keep only the oldest one
                if _gfDF.loc[_index,QTimestamp] < _gfDF.loc[afterIndex,QTimestamp]:
                    _gfDF.loc[afterIndex,QTemporality] = _undefined
                else:
                    temporality = _undefined
            _gfDF.loc[_index,QTemporality] = temporality
            if temporality == _before:
                beforeIndex = _index
            elif temporality == _after:
                afterIndex = _index
            temporalities[_userId] = [_firstEventDate, beforeIndex, afterIndex]
        # NOTE(review): the original indentation was lost; this print is assumed
        # to belong to the "not set yet" branch.
        print("temporalities set (first event method)")
In [ ]:
# when did the user answer the questionnaire?
# After gameEventDate, before gameEventDate, undefined?
# answerDate is assumed to be the gform Timestamp, UTC
# gameEventDate is assumed to be of type pandas._libs.tslib.Timestamp, UTC, from RedMetrics
def getTemporality( answerDate, gameEventDate ):
    """Return the element of answerTemporalities describing answerDate vs gameEventDate.

    answerTemporalities[0] when answerDate <= gameEventDate,
    answerTemporalities[1] when answerDate > gameEventDate,
    answerTemporalities[2] when gameEventDate is the UTC-localized
    pd.Timestamp.max sentinel or when neither comparison holds.
    """
    noEventSentinel = pd.Timestamp.max.tz_localize('utc')
    if not (gameEventDate != noEventSentinel):
        return answerTemporalities[2]
    if answerDate <= gameEventDate:
        return answerTemporalities[0]
    if answerDate > gameEventDate:
        return answerTemporalities[1]
    return answerTemporalities[2]
In [ ]:
# should be based on events on a 24h window
def setAnswerTemporalities2( _gfDF, _rmDF ):
    """Set QTemporality from the share of a user's events before/after each survey.

    Works in place on _gfDF, and only when QTemporality currently holds exactly
    one distinct value (the "not set yet" test). For each user, the first
    survey with at least as many events before as after it (and at least one
    before) is marked answerTemporalities[1]; otherwise the first survey with
    at least as many events after as before it (and at least one after) is
    marked answerTemporalities[0]. Other surveys are left untouched.

    _rmDF: events dataframe, forwarded to getEventCountRatios.
    """
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        # format : key = _userId, value = [pretestBeforeRatio, posttestAfterRatio, None or pretestIndex, None or posttestIndex]
        # None (not 0) marks "no survey yet": 0 is a valid row label of _gfDF.
        temporalities = {}
        for _index in _gfDF.index:
            _userId = _gfDF.loc[_index,localplayerguidkey]
            pretestBeforeRatio, posttestAfterRatio, pretestIndex, posttestIndex = [1.0, 1.0, None, None]
            answerDate = _gfDF.loc[_index,QTimestamp]
            [eventsBeforeRatio, eventsAfterRatio] = getEventCountRatios(answerDate, _userId, _rmDF, _gfDF)
            if _userId in temporalities:
                pretestBeforeRatio, posttestAfterRatio, pretestIndex, posttestIndex = temporalities[_userId]
            if ((eventsBeforeRatio == eventsAfterRatio) and (0 != eventsBeforeRatio)):
                # str(): the user id read from the dataframe is not guaranteed to be a string
                print("anomaly for userId=" + str(_userId) + ": eventsBeforeRatio == eventsAfterRatio != 0")
            # update posttest if there are less events afterwards?
            # keep the oldest anyways?
            if (posttestIndex is None) and (eventsBeforeRatio >= eventsAfterRatio) and (0 != eventsBeforeRatio):
                posttestAfterRatio = eventsAfterRatio
                posttestIndex = _index
                _gfDF.loc[_index,QTemporality] = answerTemporalities[1]
            # update pretest if there are more events before?
            # keep the oldest anyways?
            elif (pretestIndex is None) and (eventsBeforeRatio <= eventsAfterRatio) and (0 != eventsAfterRatio):
                pretestBeforeRatio = eventsBeforeRatio
                pretestIndex = _index
                _gfDF.loc[_index,QTemporality] = answerTemporalities[0]
            # the two ratios are stored but not read back by any decision above
            temporalities[_userId] = [pretestBeforeRatio, posttestAfterRatio, pretestIndex, posttestIndex]
        # NOTE(review): the original indentation was lost; this print is assumed
        # to belong to the "not set yet" branch.
        print("temporalities set (ratio method)")
In [ ]:
def getEventCountRatios(answerDate, userId, _rmDF, _gfDF):
    """Return [share of the user's events before answerDate, share after it].

    Events are the rows of _rmDF whose 'userId' equals userId; their
    'userTime' is compared strictly to answerDate, so events exactly at
    answerDate count in neither share. Returns [0, 0] when the user has no
    event. _gfDF is accepted but not used.
    """
    userEvents = _rmDF[_rmDF['userId'] == userId]
    eventCount = len(userEvents)
    if 0 == eventCount:
        return [0, 0]
    eventTimes = userEvents['userTime']
    beforeCount = int((eventTimes < answerDate).sum())
    afterCount = int((eventTimes > answerDate).sum())
    return [beforeCount / eventCount, afterCount / eventCount]
In [ ]:
def getCorrections( _userId, _gfDF, _source = correctAnswers, _columnAnswers = None ):
    """Return the user's answers with one added correction column per survey.

    _userId, _gfDF: used to fetch the answers through getAnswers when
        _columnAnswers is not provided (None or empty).
    _source: indexed by question; _source.loc[question] is the collection of
        accepted answers, empty when the question is not graded.
    _columnAnswers: optional frame shaped like getAnswers' output; it is
        modified in place when provided.

    For each column whose name contains answersColumnNameStem, a column named
    correctionsColumnNameStem + <same suffix> is added. It holds True when the
    answer, as a string, starts with one of the accepted answers, False when it
    does not, and NaN for questions without accepted answers.
    """
    if _columnAnswers is None or len(_columnAnswers) == 0:
        _columnAnswers = getAnswers( _userId, _gfDF = _gfDF )
    if 0 == len(_columnAnswers.columns):
        # user has never answered
        print("can't give correct answers")
        return _columnAnswers
    # snapshot of the answer columns: correction columns are added while looping
    _answerColumnNames = [
        _columnName for _columnName in _columnAnswers.columns
        if answersColumnNameStem in _columnName
    ]
    for _columnName in _answerColumnNames:
        _answerNumber = _columnName.replace(answersColumnNameStem,"")
        newCorrectionsColumnName = correctionsColumnNameStem + _answerNumber
        _columnAnswers[newCorrectionsColumnName] = np.nan
        for question in _columnAnswers[_columnName].index:
            _correctAnswers = _source.loc[question]
            if(len(_correctAnswers) > 0):
                _prefixes = tuple(str(_correctAnswer) for _correctAnswer in _correctAnswers)
                _givenAnswer = str(_columnAnswers.loc[question,_columnName])
                _columnAnswers.loc[question,newCorrectionsColumnName] = _givenAnswer.startswith(_prefixes)
    return _columnAnswers
# edits in-place
# _corrections must be a dataframe full of corrections as produced above
def getBinarizedCorrections( _corrections ):
    """Replace cells equal to True by 1.0 and cells equal to False by 0.0.

    The frame is modified in place and also returned; every other cell is left
    unchanged.
    """
    for _columnName in _corrections.columns:
        for _rowLabel in _corrections[_columnName].index:
            _cell = _corrections.loc[_rowLabel, _columnName]
            if True == _cell:
                _replacement = 1.0
            elif False == _cell:
                _replacement = 0.0
            else:
                continue
            _corrections.loc[_rowLabel, _columnName] = _replacement
    return _corrections
# only for one line in the gform
def getBinarized(_gfDFRow, _source = correctAnswers):
    """Grade one survey row: 1 for a correct answer, 0 otherwise.

    An answer is correct when, as a string, it starts with one of the accepted
    answers in _source.loc[question]. The result only contains the questions of
    _source that have at least one accepted answer.
    """
    _gradedQuestions = [
        _question for _question in _source.index
        if len(_source.loc[_question]) > 0
    ]
    _binarized = pd.Series(np.full(len(_gfDFRow.index), np.nan), index = _gfDFRow.index)
    for question in _gfDFRow.index:
        _accepted = _source.loc[question]
        if len(_accepted) == 0:
            continue
        _binarized[question] = 0
        _given = str(_gfDFRow.loc[question])
        if any(_given.startswith(str(_candidate)) for _candidate in _accepted):
            _binarized.loc[question] = 1
    return _binarized.loc[_gradedQuestions]
def getAllBinarized(_gfDF, _source = correctAnswers):
    """Return the binarized corrections of every survey in _gfDF.

    One row per survey (labelled with its correction column name), one column
    per question of _source that has at least one accepted answer. Cells come
    from getCorrections followed by getBinarizedCorrections.
    """
    _notEmptyIndexes = [
        _index for _index in _source.index
        if len(_source.loc[_index]) > 0
    ]
    # starting from an empty frame carrying the question index keeps the
    # result well-formed when there is no responder
    _pieces = [pd.DataFrame(index = _notEmptyIndexes)]
    for _userId in getAllResponders(_gfDF = _gfDF):
        _corrections = getCorrections(_userId, _source=_source, _gfDF = _gfDF)
        _binarized = getBinarizedCorrections(_corrections)
        _isCorrectionColumn = _binarized.columns.to_series().str.contains(correctionsColumnNameStem)
        _pieces.append(_binarized.loc[_notEmptyIndexes][_binarized.columns[_isCorrectionColumn]])
    # single concatenation instead of one per user (which re-copied the growing frame)
    _result = pd.concat(_pieces, axis=1)
    return _result.T
# CCA.iloc[i,j] is the number of users who correctly answered questions number i and j
# CCA[i,j] = Sum(A[u,i] * A[u,j], u in users) = Sum(tA[i,u] * A[u,j], u in users) = tA.A[i,j]
# CCA[i,j] is an int
def getCrossCorrectAnswers( _binarizedAnswers ):
    """Return the matrix product of the transposed answers with the answers."""
    transposedAnswers = _binarizedAnswers.T
    crossCorrect = transposedAnswers.dot(_binarizedAnswers)
    return crossCorrect
#function that returns the score from user id
# Row label used in the dataframe returned by getScore.
scoreLabel = 'score'
def getScore( _userId, _gfDF, _source = correctAnswers ):
    """Return the scores of _userId, grouped by temporality.

    The result is a dataframe with a single row labelled scoreLabel and one
    column per element of answerTemporalities; each cell holds a list with one
    score per survey of that temporality. A score is the number of True cells
    in the survey's correction column, as computed by getCorrections. When the
    user has no survey in _gfDF, a message is printed and every list is empty.
    """
    _score = pd.DataFrame({}, columns = answerTemporalities)
    # create the row first, then put an empty list in each of its cells
    _score.loc[scoreLabel,:] = np.nan
    for _column in _score.columns:
        _score.loc[scoreLabel, _column] = []
    if hasAnswered(_userId, _gfDF):
        _columnAnswers = getCorrections(_userId, _gfDF = _gfDF, _source = _source)
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                # the temporality of a survey is read from its answers column
                _answerColumnName = _columnName.replace(correctionsColumnNameStem,\
                                                        answersColumnNameStem)
                _temporality = _columnAnswers.loc[QTemporality,_answerColumnName]
                _counts = (_columnAnswers[_columnName]).value_counts()
                _thisScore = 0
                if(True in _counts):
                    _thisScore = _counts[True]
                # appends to the list stored in the cell
                _score.loc[scoreLabel,_temporality].append(_thisScore)
    else:
        print("user " + str(_userId) + " has never answered")
    return _score
def getGFormRowCorrection(_gfDFRow, _source = correctAnswers):
    """Return a series telling, per question of the row, whether the answer is correct.

    Cells are True when the answer, as a string, starts with one of the
    accepted answers in _source.loc[question], False when it does not, and NaN
    when the question has no accepted answer. An empty row is reported with a
    message and a copy of it is returned.
    """
    if 0 == len(_gfDFRow):
        print("this gform row is empty")
        return _gfDFRow.copy()
    result = pd.Series(index = _gfDFRow.index, data = np.full(len(_gfDFRow), np.nan))
    for question in result.index:
        _accepted = _source.loc[question]
        if not (len(_accepted) > 0):
            continue
        result.loc[question] = False
        _given = str(_gfDFRow.loc[question])
        if any(_given.startswith(str(_candidate)) for _candidate in _accepted):
            result.loc[question] = True
    return result
def getGFormRowScore( _gfDFRow, _source = correctAnswers):
    """Return the number of True cells in the correction of one survey row."""
    _valueCounts = getGFormRowCorrection( _gfDFRow, _source = _source).value_counts()
    return _valueCounts[True] if (True in _valueCounts) else 0
In [ ]:
# Numeric codings of the non-scientific questions: each dict maps an answer
# text to the number used for it by getNumeric. Several questions share the
# very same dict object (QCuriosityCoding).
QCuriosityCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Énormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Slightly": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}
QCuriosityBiologyCoding = QCuriosityCoding
QCuriositySyntheticBiologyCoding = QCuriosityCoding
QCuriosityVideoGamesCoding = QCuriosityCoding
QCuriosityEngineeringCoding = QCuriosityCoding
QPlayedCoding = {"I played it multiple times recently": 3, "I played it multiple times recently on this computer": 3, "I played recently on an other computer": 2, "I played it some time ago": 1, "I just played for the first time": 1, "No / not yet": 0, "I don't know": 0}
# no coding for QAge: getNumeric converts the age with float()
#QAgeCoding
QGenderCoding = {"Female": 1, "Other": 0, "Prefer not to say": 0, "Male": -1}
QInterestVideoGamesCoding = QCuriosityCoding
QInterestBiologyCoding = QCuriosityCoding
QStudiedBiologyCoding = {"Not even in middle school": 0, "Jamais": 0, "Jamais, pas même au collège": 0, "Until the end of middle school": 1, "Jusqu'au brevet": 1, "Until the end of high school": 2, "Jusqu'au bac": 2, "Until bachelor's degree": 3, "Jusqu'à la license": 3, "At least until master's degree": 4, "Au moins jusqu'au master": 4, "I don't know": 0, "Je ne sais pas": 0}
# same as QCuriosityCoding except that code 2 is keyed by "Rarely" instead of "Slightly"
QPlayVideoGamesCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Énormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Rarely": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}
QHeardSynBioOrBioBricksCoding = {"Yes, and I know what it means" : 2, "Yes, but I don't exactly know what it means": 1, "No": 0}
QVolunteerCoding = {"Yes": 1, "No": 0}
QEnjoyedCoding = {'Extremely': 4, 'A lot': 3, 'Not at all': 0, 'A bit': 1, 'Moderately': 2, "No": 0, "Not applicable: not played yet": -1}
QLanguageCoding = {"en": 0, "fr": 1}
QTemporalityCoding = {"pretest": 0, "posttest": 1, "undefined": -5}
# The two lists below are paired by position: the n-th coding applies to the
# n-th question. Keep them in the same order.
numericDemographicQuestionsCodings = [
    QCuriosityBiologyCoding,
    QCuriositySyntheticBiologyCoding,
    QCuriosityVideoGamesCoding,
    QCuriosityEngineeringCoding,
    QPlayedCoding,
    QGenderCoding,
    QInterestVideoGamesCoding,
    QInterestBiologyCoding,
    QStudiedBiologyCoding,
    QPlayVideoGamesCoding,
    QHeardSynBioOrBioBricksCoding,
    QVolunteerCoding,
    QEnjoyedCoding,
    QLanguageCoding,
    QTemporalityCoding,
]
numericDemographicQuestions = [
    QCuriosityBiology,
    QCuriositySyntheticBiology,
    QCuriosityVideoGames,
    QCuriosityEngineering,
    QPlayed,
    QGender,
    QInterestVideoGames,
    QInterestBiology,
    QStudiedBiology,
    QPlayVideoGames,
    QHeardSynBioOrBioBricks,
    QVolunteer,
    QEnjoyed,
    QLanguage,
    QTemporality,
]
# Series indexed by question, whose values are the coding dicts.
numericDemographicQuestionsCodingsSeries = pd.Series(data = numericDemographicQuestionsCodings, index = numericDemographicQuestions)
In [ ]:
# only for one line in the gform
def getNumeric(_gfDFRow, _source = correctAnswers):
    """Convert one survey row to numbers.

    - questions in scientificQuestions: 1 when the answer, as a string, starts
      with one of the accepted answers of _source, 0 otherwise (NaN when the
      question has no accepted answer);
    - QAge: the answer converted with float(), or -1 when missing;
    - questions in demographicQuestions: the code found in
      numericDemographicQuestionsCodingsSeries, or 0 when missing.
    The result only contains the questions of _source that have at least one
    accepted answer.
    """
    _keptQuestions = [
        _question for _question in _source.index
        if len(_source.loc[_question]) > 0
    ]
    _numeric = pd.Series(np.full(len(_gfDFRow.index), np.nan), index = _gfDFRow.index)
    for question in _gfDFRow.index:
        _answer = _gfDFRow.loc[question]
        if question in scientificQuestions:
            _accepted = _source.loc[question]
            if len(_accepted) > 0:
                _numeric[question] = 0
                if any(str(_answer).startswith(str(_candidate)) for _candidate in _accepted):
                    _numeric.loc[question] = 1
        elif question == QAge:
            _numeric.loc[question] = float(_answer) if pd.notnull(_answer) else -1
        elif question in demographicQuestions:
            if pd.notnull(_answer):
                _coding = numericDemographicQuestionsCodingsSeries.loc[question]
                _numeric.loc[question] = _coding[_answer]
            else:
                _numeric.loc[question] = 0
    return _numeric.loc[_keptQuestions]
In [ ]:
def createStatSet(series, ids = None):
    """Return basic statistics of series as a dict.

    series: the values to summarize (median, mean, std).
    ids: optional identifiers used for 'count' (their number) and 'unique'
        (number of distinct ones); must offer unique(), like a pandas Series
        or Index. When omitted or empty, the index of series is used.

    Keys of the result: 'count', 'unique', 'median', 'mean', 'std'.
    """
    # None replaces the former `pd.Series()` default, which was a single object
    # created at definition time and shared by every call.
    if ids is None or 0 == len(ids):
        ids = series.index
    result = {
        'count' : len(ids),
        'unique' : len(ids.unique()),
        'median' : series.median(),
        'mean' : series.mean(),
        'std' : series.std(),
    }
    return result
# _binarized must be well-formed, similarly to getAllBinarized's output
def getPercentagePerQuestion(_binarized):
    """Return, per column of _binarized, 100 * (sum of the column) / (number of rows).

    The result is a single-column dataframe indexed by the columns of
    _binarized.
    """
    rowCount = _binarized.shape[0]
    columnTotals = np.dot(np.ones(rowCount), _binarized)
    totalsFrame = pd.DataFrame(data=columnTotals, index=_binarized.columns)
    return totalsFrame*100 / rowCount
In [ ]:
## gfDF can be: all, those who answered both before and after,
## those who played between date1 and date2, ...
from scipy.stats import ttest_ind
def plotBasicStats(
    gfDF,
    title = np.nan,
    includeAll = False,
    includeBefore = True,
    includeAfter = True,
    includeUndefined = False,
    includeProgress = True,
    includeRelativeProgress = False,
    horizontalPlot = True,
    sortedAlong = '', # in ["pretest", "posttest", "progression"]
    figsize=(20,4),
    annot=True,
    cbar=True,
    annot_kws={"size": 10},
    font_scale=1,
):
    """Print score statistics of gfDF and plot the percentage of correct answers per question.

    gfDF: surveys to analyse; split on QTemporality into pretests, posttests
        and undefined surveys.
    title: printed before the statistics and used as plot title; when null, a
        title listing the plotted categories is generated.
    include*: which categories to compute (all surveys, pretests, posttests,
        undefined, progress = posttest - pretest, progress relative to pretest).
    horizontalPlot: plot questions as columns instead of rows.
    sortedAlong: name of a plotted column to sort the questions along;
        scientific and demographic questions are sorted separately.
    figsize, annot, cbar, annot_kws, font_scale: forwarded to matplotlib/seaborn.

    Prints the title, the statistics table and, when more than two pretest and
    posttest scores exist, a t test between them.

    Returns the plotted matrix of rounded percentages, or an empty dataframe
    when no category could be plotted (previously an UnboundLocalError).
    """
    # pretest/posttest data are also computed when only the progress is requested
    needBefore = includeBefore or includeProgress
    needAfter = includeAfter or includeProgress

    stepsPerInclude = 2
    # counts the computations actually performed, so that the progress bar
    # ends exactly on stepsCount
    includeCount = np.sum([includeAll, needBefore, needAfter, includeUndefined, includeProgress])
    stepsCount = stepsPerInclude*includeCount + 3

    progressBar = FloatProgress(min=0, max=stepsCount)
    display(progressBar)

    gfDFPretests = gfDF[gfDF[QTemporality] == answerTemporalities[0]]
    gfDFPosttests = gfDF[gfDF[QTemporality] == answerTemporalities[1]]
    gfDFUndefined = gfDF[gfDF[QTemporality] == answerTemporalities[2]]

    scientificQuestionsSource = correctAnswers.copy()
    allQuestionsSource = correctAnswers + demographicAnswers

    categories = ['all', answerTemporalities[0], answerTemporalities[1], answerTemporalities[2],\
                  'progress', 'rel. progress']
    # labels of the plotted columns; categories not listed keep their own name
    displayNames = {
        categories[1]: "pretest",
        categories[2]: "posttest",
        categories[4]: "progression",
    }

    data = {}

    allBinarized = pd.DataFrame()
    allBinarizedBefore = pd.DataFrame()
    scoresBefore = pd.DataFrame()
    allBinarizedAfter = pd.DataFrame()
    scoresAfter = pd.DataFrame()
    allBinarizedUndefined = pd.DataFrame()

    ## basic stats:
    ### mean score
    ### median score
    ### std
    if includeAll:
        sciBinarized = getAllBinarized(gfDF, _source = scientificQuestionsSource)
        progressBar.value += 1
        allBinarized = getAllBinarized(gfDF, _source = allQuestionsSource)
        progressBar.value += 1
        # score of a survey = sum of its binarized scientific answers
        scoresAll = pd.Series(np.dot(sciBinarized, np.ones(sciBinarized.shape[1])))
        data[categories[0]] = createStatSet(scoresAll, gfDF[localplayerguidkey])

    if needBefore:
        sciBinarizedBefore = getAllBinarized(gfDFPretests, _source = scientificQuestionsSource)
        progressBar.value += 1
        allBinarizedBefore = getAllBinarized(gfDFPretests, _source = allQuestionsSource)
        progressBar.value += 1
        scoresBefore = pd.Series(np.dot(sciBinarizedBefore, np.ones(sciBinarizedBefore.shape[1])))
        temporaryStatSetBefore = createStatSet(scoresBefore, gfDFPretests[localplayerguidkey])
    if includeBefore:
        data[categories[1]] = temporaryStatSetBefore

    if needAfter:
        sciBinarizedAfter = getAllBinarized(gfDFPosttests, _source = scientificQuestionsSource)
        progressBar.value += 1
        allBinarizedAfter = getAllBinarized(gfDFPosttests, _source = allQuestionsSource)
        progressBar.value += 1
        scoresAfter = pd.Series(np.dot(sciBinarizedAfter, np.ones(sciBinarizedAfter.shape[1])))
        temporaryStatSetAfter = createStatSet(scoresAfter, gfDFPosttests[localplayerguidkey])
    if includeAfter:
        data[categories[2]] = temporaryStatSetAfter

    if includeUndefined:
        sciBinarizedUndefined = getAllBinarized(gfDFUndefined, _source = scientificQuestionsSource)
        progressBar.value += 1
        allBinarizedUndefined = getAllBinarized(gfDFUndefined, _source = allQuestionsSource)
        progressBar.value += 1
        scoresUndefined = pd.Series(np.dot(sciBinarizedUndefined, np.ones(sciBinarizedUndefined.shape[1])))
        data[categories[3]] = createStatSet(scoresUndefined, gfDFUndefined[localplayerguidkey])

    if includeProgress:
        data[categories[4]] = {
            'count' : min(temporaryStatSetAfter['count'], temporaryStatSetBefore['count']),
            'unique' : min(temporaryStatSetAfter['unique'], temporaryStatSetBefore['unique']),
            'median' : temporaryStatSetAfter['median']-temporaryStatSetBefore['median'],
            'mean' : temporaryStatSetAfter['mean']-temporaryStatSetBefore['mean'],
            'std' : temporaryStatSetAfter['std']-temporaryStatSetBefore['std'],
        }
        progressBar.value += 2

    result = pd.DataFrame(data)
    progressBar.value += 1

    print(title)
    print(result)
    if (includeBefore and includeAfter) or includeProgress:
        if (len(scoresBefore) > 2 and len(scoresAfter) > 2):
            ttest = ttest_ind(scoresBefore, scoresAfter)
            print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
            print()

    ## percentage correct
    ### percentage correct - max 5 columns
    percentagePerQuestionBefore = pd.DataFrame()
    percentagePerQuestionAfter = pd.DataFrame()
    percentagePerQuestionProgress = pd.DataFrame()

    tables = []

    if includeAll:
        percentagePerQuestionAll = getPercentagePerQuestion(allBinarized)
        tables.append([percentagePerQuestionAll, categories[0]])

    if needBefore:
        percentagePerQuestionBefore = getPercentagePerQuestion(allBinarizedBefore)
    if includeBefore:
        tables.append([percentagePerQuestionBefore, categories[1]])

    if needAfter:
        percentagePerQuestionAfter = getPercentagePerQuestion(allBinarizedAfter)
    if includeAfter:
        tables.append([percentagePerQuestionAfter, categories[2]])

    if includeUndefined:
        percentagePerQuestionUndefined = getPercentagePerQuestion(allBinarizedUndefined)
        tables.append([percentagePerQuestionUndefined, categories[3]])

    if includeProgress or includeRelativeProgress:
        percentagePerQuestionProgress = percentagePerQuestionAfter - percentagePerQuestionBefore

    if includeProgress:
        tables.append([percentagePerQuestionProgress, categories[4]])

    if includeRelativeProgress:
        # use temporaryStatSetAfter['count'], temporaryStatSetBefore['count']?
        percentagePerQuestionProgress2 = percentagePerQuestionProgress.copy()
        for index in range(0,len(percentagePerQuestionProgress.index)):
            # a question nobody answered correctly beforehand has no relative progress
            if (0 == percentagePerQuestionBefore.iloc[index,0]):
                percentagePerQuestionProgress2.iloc[index,0] = 0
            else:
                percentagePerQuestionProgress2.iloc[index,0] = \
                    percentagePerQuestionProgress.iloc[index,0]/percentagePerQuestionBefore.iloc[index,0]
        tables.append([percentagePerQuestionProgress2, categories[5]])

    progressBar.value += 1

    graphTitle = '% correct: '
    toConcat = []
    columnNames = []

    for table,category in tables:
        # only plot tables that are not empty and contain no NaN
        plottable = (len(table.values) > 0) \
            and not any(np.isnan(elt) for elt in table.iloc[:,0].values)
        if plottable:
            graphTitle = graphTitle + category + ' '
            toConcat.append(table)
            columnNames.append(displayNames.get(category, category))

    matrixToDisplay = pd.DataFrame()

    if (len(toConcat) > 0):
        percentagePerQuestionConcatenated = pd.concat(toConcat, axis=1)

        _fig = plt.figure(figsize=figsize)
        _ax1 = plt.subplot(111)
        if pd.isnull(title):
            _ax1.set_title(graphTitle)
        else:
            _ax1.set_title(title)

        matrixToDisplay = percentagePerQuestionConcatenated.round().astype(int)
        # one label per concatenated table; the former fixed list of three
        # labels failed for any other combination of include* flags
        matrixToDisplay.columns = columnNames
        if sortedAlong in matrixToDisplay.columns:
            answeredDemographicQuestions = demographicAnswers[demographicAnswers.apply(len) > 0].index
            sciSorted = matrixToDisplay.loc[scientificQuestions, :].sort_values(by = sortedAlong, ascending = True)
            demoSorted = matrixToDisplay.loc[answeredDemographicQuestions, :].sort_values(by = sortedAlong, ascending = True)
            matrixToDisplay = pd.concat([sciSorted, demoSorted])

        if horizontalPlot:
            matrixToDisplay = matrixToDisplay.T

        sns.set(font_scale=font_scale)
        sns.heatmap(
            matrixToDisplay,
            ax=_ax1,
            cmap=plt.cm.jet,
            square=True,
            annot=annot,
            fmt='d',
            vmin=0,
            vmax=100,
            cbar=cbar,
            annot_kws=annot_kws,
        )

    progressBar.value += 1

    ### percentage cross correct
    ### percentage cross correct, conditionnally

    if(progressBar.value != stepsCount):
        print("__progress.value=" + str(progressBar.value) + " != stepsCount=" + str(stepsCount))

    progressBar.close()

    return matrixToDisplay
In [ ]:
def plotCorrelationMatrices(
    allBinarized = [],
    beforeBinarized = [],
    afterBinarized = [],
    undefinedBinarized = [],
    titleAll = 'Correlation of pre- & post-test answers',
    titleBefore = 'Correlation of pre-test answers',
    titleAfter = 'Correlation of post-test answers',
    titleUndefined = 'Correlation of undefined answers',
    titleSuffix = '',
):
    """Plot one correlation matrix per non-empty binarized dataset.

    Each dataset with a length above zero is passed to plotCorrelationMatrix
    together with its title followed by titleSuffix, using absolute values,
    no clustering, question numbers, annotations and a (20,20) figure.
    """
    datasetsAndTitles = [
        (allBinarized, titleAll + titleSuffix),
        (beforeBinarized, titleBefore + titleSuffix),
        (afterBinarized, titleAfter + titleSuffix),
        (undefinedBinarized, titleUndefined + titleSuffix),
    ]
    for binarized, plotTitle in datasetsAndTitles:
        if len(binarized) > 0:
            plotCorrelationMatrix(
                binarized,
                _abs=True,
                _clustered=False,
                _questionNumbers=True,
                _annot = True,
                _figsize = (20,20),
                _title=plotTitle,
            )
##correlation
### simple heatmap
### clustermap
# correlation methods accepted by plotCorrelationMatrix; the first one is the fallback
methods = ['pearson', 'kendall', 'spearman']
def plotCorrelationMatrix(
    _binarizedMatrix,
    _method = methods[0],
    _title='Questions\' Correlations',
    _abs=False,
    _clustered=False,
    _questionNumbers=False,
    _annot = False,
    _figsize = (10,10),
    _metric='euclidean'
):
    """Plot the correlation matrix of the columns of _binarizedMatrix.

    _binarizedMatrix: one column per question, as produced by getAllBinarized.
    _method: one of `methods`; any other value falls back to methods[0].
    _title: title of the plain heatmap (not used for the clustermap).
    _abs: plot absolute values of the correlations.
    _clustered: draw a seaborn clustermap instead of a plain heatmap; questions
        whose correlations are all NaN are dropped first.
    _questionNumbers: add the position of each question to its label.
    _annot: annotate cells with getCrossCorrectAnswers' counts.
    _figsize: figure size.
    _metric: distance metric forwarded to the clustermap.

    Returns (clustermap result, overlay) when _clustered, None otherwise.
    """
    _progress = FloatProgress(min=0, max=7)
    display(_progress)
    # try/finally: the clustered branch returns early and used to leave the
    # progress widget open
    try:
        _overlay = False
        _progress.value += 1

        # computation of correlation matrix
        _m = _method if (_method in methods) else methods[0]
        _correlation = _binarizedMatrix.astype(float).corr(_m)
        _progress.value += 1
        if(_abs):
            _correlation = _correlation.abs()
        _progress.value += 1

        if(_clustered):
            # removing NaNs
            # can't cluster NaN lines in _correlation
            # keep a question if at least one of its correlations is not NaN;
            # `not` instead of `~`, which is a bitwise operator on a plain bool
            _notNaNsIndices = [
                index for index in _correlation.index
                if not pd.isnull(_correlation.loc[index,:]).all()
            ]
            _binarizedMatrix = _binarizedMatrix.loc[:,_notNaNsIndices]
            _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
        _progress.value += 1

        # optional computation of overlay
        if(_annot):
            _overlay = getCrossCorrectAnswers(_binarizedMatrix).astype(int)
        _progress.value += 1

        # preparation of plot labels
        if(_questionNumbers):
            _correlation.columns = pd.Series(_correlation.columns).apply(\
                lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
            if(_clustered):
                _correlation.index = pd.Series(_correlation.columns).apply(\
                    lambda x: '#' + str(_correlation.columns.get_loc(x) + 1) + ' ' + x)
            else:
                _correlation.index = _correlation.columns
        _progress.value += 1

        vmin = 0 if _abs else -1
        vmax = 1

        # plot
        if(_clustered):
            # NOTE(review): the overlay is not reordered to follow the
            # clustermap's dendrogram (see result.dendrogram_col.reordered_ind),
            # so annotations may not match the clustered cells.
            result = sns.clustermap(
                _correlation,
                metric=_metric,
                cmap=plt.cm.jet,
                square=True,
                figsize=_figsize,
                annot=_overlay,
                fmt='d',
                vmin=vmin,
                vmax=vmax,
            )
            return result, _overlay

        _fig = plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        sns.heatmap(
            _correlation,
            ax=_ax,
            cmap=plt.cm.jet,
            square=True,
            annot=_overlay,
            fmt='d',
            vmin=vmin,
            vmax=vmax,
        )
    finally:
        _progress.close()
#def plotAll():
# loop on question types
# loop on temporalities
# loop on representations
## basic stats:
### mean score
### median score
### std
## percentage correct
### percentage correct - 3 columns
### percentage cross correct
### percentage cross correct, conditionnally
##correlation
### simple heatmap
# plotCorrelationMatrix
### clustermap
# plotCorrelationMatrix
In [ ]:
def plotSamples(gfDFs):
    """Call plotBasicStats on every (dataframe, title) pair of gfDFs, showing a progress bar."""
    _progress = FloatProgress(min=0, max=len(gfDFs))
    display(_progress)
    for gfDF, title in gfDFs:
        plotBasicStats(gfDF, title)
        _progress.value += 1
    if(_progress.value != len(gfDFs)):
        # reads _progress: the former `__progress` was undefined here and raised a NameError
        print("__progress.value=" + str(_progress.value) + " != len(gfDFs)=" + str(len(gfDFs)))
    _progress.close()
In [ ]:
# for per-gform, manual analysis
def getGFormDataPreview(_GFUserId, gfDF):
    """Return a summary of every survey of _GFUserId found in the global gform.

    The result maps 'survey<n>' (n = position among the user's surveys) to a
    dict with the survey date, its QTemporality value, the temporality derived
    from its QPlayed answer, its score, its [gender, age], and the
    [row label, user id] pairs of the rows returned by
    getMatchingDemographics(gfDF, survey).
    """
    userSurveys = gform[gform[localplayerguidkey] == _GFUserId]
    preview = {}
    for position in range(len(userSurveys)):
        survey = userSurveys.iloc[position]
        summary = {
            'date': survey[QTimestamp],
            'temporality RM': survey[QTemporality],
            'temporality GF': getGFormRowGFormTemporality(survey),
            'score': getGFormRowScore(survey),
            'genderAge': [survey[QGender], survey[QAge]],
        }
        # search for other users with similar demographics
        matches = getMatchingDemographics(gfDF, survey)
        summary['demographic matches'] = [
            [matchLabel, matches.loc[matchLabel, localplayerguidkey]]
            for matchLabel in matches.index
        ]
        preview['survey' + str(position)] = summary
    return preview
In [ ]:
# indices do not need to be reset as they all come from gform
def getUnionQuestionnaires(gfDF1, gfDF2):
    """Return the rows of gfDF1 followed by those of gfDF2, without duplicated rows.

    A warning is printed when the two dataframes do not have the same columns
    in the same order; the union is computed anyway.
    """
    # Index.equals also copes with column sets of different lengths, where the
    # element-wise `==` comparison raised a ValueError
    if not gfDF1.columns.equals(gfDF2.columns):
        print("warning: parameter columns are not the same")
    return pd.concat([gfDF1, gfDF2]).drop_duplicates()
In [ ]:
# indices do not need to be reset as they all come from gform
def getIntersectionQuestionnaires(gfDF1, gfDF2):
    """Return the inner merge of gfDF1 and gfDF2, without duplicated rows.

    A warning is printed when the two dataframes do not have the same columns
    in the same order; the merge is attempted anyway.
    """
    # Index.equals also copes with column sets of different lengths, where the
    # element-wise `==` comparison raised a ValueError
    if not gfDF1.columns.equals(gfDF2.columns):
        print("warning: parameter columns are not the same")
    return pd.merge(gfDF1, gfDF2, how = 'inner').drop_duplicates()
In [ ]:
# get gfDF1 and gfDF2 rows where users are common to gfDF1 and gfDF2
def getIntersectionUsersSurveys(gfDF1, gfDF2):
    """Return the union of the rows of both dataframes whose user id appears in the other one."""
    userIds1 = gfDF1[localplayerguidkey]
    userIds2 = gfDF2[localplayerguidkey]
    sharedIn1 = gfDF1[userIds1.isin(userIds2)]
    sharedIn2 = gfDF2[userIds2.isin(userIds1)]
    return getUnionQuestionnaires(sharedIn1, sharedIn2)
In [ ]:
# Bare expression: displays the distinct QPlayed answers found in gform as the cell output.
gform[QPlayed].unique()
In [ ]:
def getRMBefores(gfDF):
    """Return the rows of gfDF whose QTemporality equals answerTemporalities[0]."""
    isBefore = (gfDF[QTemporality] == answerTemporalities[0])
    return gfDF[isBefore]
In [ ]:
def getRMAfters(gfDF):
    """Return the rows of gfDF whose QTemporality equals answerTemporalities[1]."""
    isAfter = (gfDF[QTemporality] == answerTemporalities[1])
    return gfDF[isAfter]
In [ ]:
# returns users who declared that they have never played the game, whatever platform
# everPlayedPositives is defined outside this notebook section
def getGFormBefores(gfDF):
    """Return the rows of gfDF whose QPlayed answer is not in everPlayedPositives."""
    declaredPlayed = gfDF[QPlayed].isin(everPlayedPositives)
    return gfDF[~declaredPlayed]
In [ ]:
def isGFormBefore(surveyAnswerIndex, _gform):
    """Tell whether the survey labelled surveyAnswerIndex is a Google form 'before'.

    The row is extracted as a one-label slice of _gform and passed through
    getGFormBefores; the answer is True when exactly one row remains.
    """
    singleSurvey = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    remainingCount = len(getGFormBefores(singleSurvey))
    return remainingCount == 1
In [ ]:
# returns users who declared that they have already played the game, whatever platform
# everPlayedPositives is defined in "../Functions/0.1 GF English localization.ipynb"
def getGFormAfters(gfDF):
    """Return the rows of gfDF whose QPlayed answer is in everPlayedPositives."""
    declaredHavingPlayed = gfDF[QPlayed].isin(everPlayedPositives)
    return gfDF[declaredHavingPlayed]
In [ ]:
def isGFormAfter(surveyAnswerIndex, _gform):
    """Tell whether the survey labelled surveyAnswerIndex is a Google form 'after'.

    The row is extracted as a one-label slice of _gform and passed through
    getGFormAfters; the answer is True when exactly one row remains.
    """
    singleSurvey = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    remainingCount = len(getGFormAfters(singleSurvey))
    return remainingCount == 1
In [ ]:
# returns an element of answerTemporalities
# everPlayedPositives is defined in '../Static data/English localization.ipynb'
def getGFormRowGFormTemporality(_gfDFRow):
    """Map one survey row to a temporality label using its QPlayed answer.

    Returns answerTemporalities[1] when the row's QPlayed answer belongs to
    everPlayedPositives, and answerTemporalities[0] otherwise.
    """
    declaredHavingPlayed = _gfDFRow[QPlayed] in everPlayedPositives
    return answerTemporalities[1] if declaredHavingPlayed else answerTemporalities[0]
In [ ]:
def getSurveysOfUsersWhoAnsweredBoth(gfDF, gfMode = True, rmMode = False):
    """Return the surveys of users who have both a 'before' and an 'after' survey.

    Parameters:
        gfDF: table of survey answers.
        gfMode: when true, befores/afters are selected with
            getGFormBefores/getGFormAfters.
        rmMode: when true, befores/afters are selected with
            getRMBefores/getRMAfters.

    When both modes are enabled the two selections are chained; when neither
    is, both sides are the unfiltered gfDF. The two sides are then combined
    with getIntersectionUsersSurveys.
    """
    befores, afters = gfDF, gfDF
    # each enabled mode narrows both sides with its own pair of selectors
    selectorPairs = (
        (gfMode, getGFormBefores, getGFormAfters),
        (rmMode, getRMBefores, getRMAfters),
    )
    for enabled, selectBefores, selectAfters in selectorPairs:
        if enabled:
            befores = selectBefores(befores)
            afters = selectAfters(afters)
    return getIntersectionUsersSurveys(befores, afters)
In [ ]:
def getSurveysThatAnswered(gfDF, questionsAndPositiveAnswers, hardPolicy = True):
    """Return the rows of gfDF selected by a list of (question, accepted answers).

    Parameters:
        gfDF: table of survey answers.
        questionsAndPositiveAnswers: iterable of pairs
            [question column, list of accepted answers].
        hardPolicy: when true a row must match every pair (logical AND);
            otherwise matching at least one pair is enough (logical OR).

    Returns:
        The matching rows of gfDF. With an empty list of pairs, the hard
        policy keeps every row and the soft policy keeps none.
    """
    useHardPolicy = bool(hardPolicy)
    # seed with the neutral element of the combining operator; the mask is
    # built on gfDF's own index so that it stays aligned with gfDF whatever
    # its index labels are (filtered or re-ordered tables included)
    filterSeries = pd.Series(useHardPolicy, index = gfDF.index, dtype = bool)
    for question, positiveAnswers in questionsAndPositiveAnswers:
        matches = gfDF[question].isin(positiveAnswers)
        if useHardPolicy:
            filterSeries = filterSeries & matches
        else:
            filterSeries = filterSeries | matches
    return gfDF[filterSeries]
In [ ]:
# surveys of people who have studied biology, and/or know about synthetic biology, and/or about BioBricks
def getSurveysOfBiologists(gfDF, hardPolicy = True):
    """Select the surveys whose answers mark the respondent as a biologist.

    Two questions are checked: QStudiedBiology against biologyStudyPositives,
    and QHeardSynBioOrBioBricks against heardAboutBioBricksPositives.
    QInterestBiology / biologyInterestPositives is deliberately left out
    (noted as irrelevant by the original author).

    hardPolicy is forwarded to getSurveysThatAnswered: true requires both
    criteria, false accepts either.
    """
    studiedBiology = [QStudiedBiology, biologyStudyPositives]
    heardOfSynBio = [QHeardSynBioOrBioBricks, heardAboutBioBricksPositives]
    criteria = [studiedBiology, heardOfSynBio]
    return getSurveysThatAnswered(gfDF, criteria, hardPolicy)
In [ ]:
# surveys of people who play video games and/or are interested in them
def getSurveysOfGamers(gfDF, hardPolicy = True):
    """Select the surveys whose answers mark the respondent as a gamer.

    Two questions are checked: QInterestVideoGames against interestPositives,
    and QPlayVideoGames against frequencyPositives.

    hardPolicy is forwarded to getSurveysThatAnswered: true requires both
    criteria, false accepts either.
    """
    interestedInGames = [QInterestVideoGames, interestPositives]
    playsGames = [QPlayVideoGames, frequencyPositives]
    criteria = [interestedInGames, playsGames]
    return getSurveysThatAnswered(gfDF, criteria, hardPolicy)
In [ ]:
def getSurveysWithMatchingAnswers(gfDF, _gfDFRow, strictList, extendedList = None, hardPolicy = False):
    """Return the rows of gfDF that answer like a reference row on given questions.

    Parameters:
        gfDF: table of survey answers to search.
        _gfDFRow: reference survey row, indexable by question.
        strictList: questions that are always compared.
        extendedList: additional questions, compared only under hardPolicy.
        hardPolicy: when true, extendedList is compared on top of strictList.

    Returns:
        The rows of gfDF whose answer equals the reference row's answer on
        every compared question (selection done by getSurveysThatAnswered
        with its hard policy).
    """
    # work on a copy: extending strictList itself would silently grow the
    # caller's list every time the hard policy is used
    questions = list(strictList)
    if hardPolicy and extendedList:
        questions.extend(extendedList)
    # the only accepted answer to each question is the reference row's answer
    questionsAndPositiveAnswers = [[q, [_gfDFRow[q]]] for q in questions]
    return getSurveysThatAnswered(gfDF, questionsAndPositiveAnswers, True)
In [ ]:
#QAge
#QGender
def getMatchingDemographics(gfDF, _gfDFRow, hardPolicy = False):
    """Return the surveys of gfDF whose demographics match those of _gfDFRow.

    Age, gender and biology studies are always compared; under hardPolicy the
    answers about interests, hobbies, knowledge and language are compared too.
    The comparison itself is delegated to getSurveysWithMatchingAnswers.
    """
    # age and gender, edu should not change
    stableQuestions = [QAge, QGender, QStudiedBiology]
    # interests, hobbies, and knowledge - evaluation may vary after playing
    # (QHeardSynBioOrBioBricks relates to heardAboutBioBricksPositives)
    # language may vary: players may have missed the opportunity to set it, or may want to try and change it
    changeableQuestions = [
        QInterestVideoGames,
        QPlayVideoGames,
        QInterestBiology,
        QHeardSynBioOrBioBricks,
        QLanguage,
    ]
    return getSurveysWithMatchingAnswers(
        gfDF,
        _gfDFRow,
        stableQuestions,
        extendedList = changeableQuestions,
        hardPolicy = hardPolicy
    )
In [ ]:
def getDemographicSamples(gfDF):
    """Split gfDF into labelled demographic sub-samples.

    Returns a list of [DataFrame, label] pairs: the whole table, then the
    surveys filtered by language, by gender, and by the biologist and gamer
    selections under both their strict (all criteria) and broad (any
    criterion) policies.
    """
    languages = gfDF[QLanguage]
    genders = gfDF[QGender]

    englishSurveys = gfDF[languages == enLanguageID]
    frenchSurveys = gfDF[languages == frLanguageID]
    femaleSurveys = gfDF[genders == 'Female']
    maleSurveys = gfDF[genders == 'Male']

    return [
        [gfDF, 'root gfDF'],
        [englishSurveys, 'English'],
        [frenchSurveys, 'French'],
        [femaleSurveys, 'female'],
        [maleSurveys, 'male'],
        [getSurveysOfBiologists(gfDF), 'biologists - strict'],
        [getSurveysOfBiologists(gfDF, False), 'biologists - broad'],
        [getSurveysOfGamers(gfDF), 'gamers - strict'],
        [getSurveysOfGamers(gfDF, False), 'gamers - broad'],
    ]
In [ ]:
def getTemporalitySamples(gfDF):
    """Split gfDF into labelled before/after sub-samples.

    Returns a list of [DataFrame, label] pairs: the whole table, the befores
    and afters according to RedMetrics, to the Google form, and to both, and
    finally the surveys of users having both a before and an after survey
    under each of those three criteria.
    """
    rmBefores = getRMBefores(gfDF)
    gfBefores = getGFormBefores(gfDF)
    rmAfters = getRMAfters(gfDF)
    gfAfters = getGFormAfters(gfDF)

    return [
        [gfDF, 'root gfDF'],
        [rmBefores, 'RedMetrics befores'],
        [gfBefores, 'Google form befores'],
        [getRMBefores(gfBefores), 'GF & RedMetrics befores'],
        [rmAfters, 'RedMetrics afters'],
        [gfAfters, 'Google form afters'],
        [getRMAfters(gfAfters), 'GF & RedMetrics afters'],
        [getSurveysOfUsersWhoAnsweredBoth(gfDF, gfMode = True, rmMode = False), 'GF both before and after'],
        [getSurveysOfUsersWhoAnsweredBoth(gfDF, gfMode = False, rmMode = True), 'RM both before and after'],
        [getSurveysOfUsersWhoAnsweredBoth(gfDF, gfMode = True, rmMode = True), 'GF & RM both before and after'],
    ]
In [ ]:
#function that returns the list of checkpoints from user id
def getValidatedCheckpoints( userId, _gfDF ):
    """Return the checkpoints validated by the answers of a user.

    Parameters:
        userId: id of the user whose surveys are examined.
        _gfDF: table of survey answers.

    Returns:
        A pandas Series holding one element per column of getCorrections()
        whose name contains correctionsColumnNameStem; each element is itself
        a Series of the distinct, sorted, non-empty checkpoints taken from
        checkpointQuestionMatching['checkpoint'] for the questions marked True
        in that column. The Series is empty when hasAnswered() is false, in
        which case a message is printed.
    """
    _validatedCheckpoints = []
    if hasAnswered(userId, _gfDF):
        # presumably one column per questionnaire of the user, holding a
        # True/False correction per question - verify against getCorrections
        _columnAnswers = getCorrections( userId, _gfDF = _gfDF)
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                # one slot per entry of checkpointQuestionMatching, filled below
                _questionnaireValidatedCheckpointsPerQuestion = pd.Series(np.nan, index=range(len(checkpointQuestionMatching)))
                for _index in range(0, len(_questionnaireValidatedCheckpointsPerQuestion)):
                    # correct answer: record the checkpoint matched to this question;
                    # otherwise record '' which is filtered out further down
                    if _columnAnswers[_columnName][_index]==True:
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = checkpointQuestionMatching['checkpoint'][_index]
                    else:
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = ''
                # deduplicate, drop the '' placeholder, sort and renumber from 0
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpointsPerQuestion.unique()
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints[_questionnaireValidatedCheckpoints!='']
                _questionnaireValidatedCheckpoints = pd.Series(_questionnaireValidatedCheckpoints)
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints.sort_values()
                _questionnaireValidatedCheckpoints.index = range(0, len(_questionnaireValidatedCheckpoints))
                _validatedCheckpoints.append(_questionnaireValidatedCheckpoints)
    else:
        print("user " + str(userId) + " has never answered")
    return pd.Series(_validatedCheckpoints)
def getValidatedCheckpointsCounts( _userId, _gfDF ):
    """Return the number of validated checkpoints for each element of getValidatedCheckpoints.

    The result is a plain list of integers, in the order of the elements
    returned by getValidatedCheckpoints(_userId, _gfDF); it is empty when that
    function returns nothing.
    """
    checkpointsPerQuestionnaire = getValidatedCheckpoints(_userId, _gfDF = _gfDF)
    return [len(checkpoints) for checkpoints in checkpointsPerQuestionnaire]
def getNonValidated( checkpoints ):
    """Return the validable checkpoints that are missing from each validated list.

    Parameters:
        checkpoints: sequence of pandas Series of validated checkpoints.

    Returns:
        When checkpoints is empty, validableCheckpoints itself. Otherwise a
        Series with one element per input list; each element is a Series,
        numbered from 0, of the values of validableCheckpoints absent from
        that list, '' excluded.
    """
    if len(checkpoints) == 0:
        return validableCheckpoints

    missingPerList = []
    for validated in checkpoints:
        missing = pd.Series(np.setdiff1d(validableCheckpoints.values, validated.values))
        missing = missing[missing != '']
        missing.index = range(0, len(missing))
        missingPerList.append(missing)
    return pd.Series(missingPerList)
def getNonValidatedCheckpoints( userId, _gfDF ):
    """Return the checkpoints the user has not validated.

    Chains getValidatedCheckpoints and getNonValidated; see the latter for
    the shape of the result.
    """
    return getNonValidated(getValidatedCheckpoints( userId, _gfDF = _gfDF ))
def getNonValidatedCheckpointsCounts( userId, _gfDF ):
    """Return the length of each element of getNonValidatedCheckpoints, as a list.

    NOTE(review): when the user has no validated list, getNonValidated hands
    back validableCheckpoints itself, so the lengths measured here are those
    of its individual elements - confirm this is intended.
    """
    missingPerList = getNonValidatedCheckpoints(userId, _gfDF = _gfDF)
    return [len(missing) for missing in missingPerList]
In [ ]:
# returns all rows of Google form's answers that contain an element
# of the array 'choice' for question number 'questionIndex'
def getAllAnswerRows(questionIndex, choice, _gfDF ):
    """Select the rows of _gfDF whose answer to a question is one of 'choice'.

    Parameters:
        questionIndex: position (not name) of the question column in _gfDF.
        choice: collection of accepted answers.
        _gfDF: table of survey answers.
    """
    questionAnswers = _gfDF.iloc[:, questionIndex]
    isChosen = questionAnswers.isin(choice)
    return _gfDF[isChosen]
def getPercentCorrectPerColumn(_df):
    """Return, per column of _df, the ratio of rows holding the correct answer.

    Only columns located at or after firstEvaluationQuestionIndex and before
    the last three columns are evaluated; the others keep NaN. An answer
    counts as correct when its string form starts with the string form of
    correctAnswers at the same column position. The returned Series is
    indexed by the column names of _df plus an extra 'Count' entry holding
    the number of rows.
    """
    _count = len(_df)
    # NaN means "column not evaluated (yet)"
    _percents = pd.Series(np.full(len(_df.columns), np.nan), index=_df.columns)
    for _rowIndex in _df.index:
        for _columnName in _df.columns:
            _columnIndex = _df.columns.get_loc(_columnName)
            # skip the columns before the first evaluation question and the
            # last three columns (presumably not questions - TODO confirm)
            if ((_columnIndex >= firstEvaluationQuestionIndex) \
                and (_columnIndex < len(_df.columns)-3)):
                if(str(_df[_columnName][_rowIndex]).startswith(str(correctAnswers[_columnIndex]))):
                    # correct answer: start or increment the tally
                    if (np.isnan(_percents[_columnName])):
                        _percents[_columnName] = 1;
                    else:
                        _percents[_columnName] = _percents[_columnName]+1
                # NOTE(review): the source lost its indentation; this 'else' is
                # assumed to pair with the startswith test - confirm
                else:
                    # incorrect answer: make sure the tally exists, at 0
                    if (np.isnan(_percents[_columnName])):
                        _percents[_columnName] = 0;
    # turn the tallies into ratios, then append the sample size
    _percents = _percents/_count
    _percents['Count'] = _count
    return _percents
def getPercentCorrectKnowingAnswer(questionIndex, choice, _gfDF):
    """Return the per-column ratios of correct answers among selected surveys.

    The surveys are those of _gfDF whose answer to the question at position
    questionIndex is one of 'choice' (see getAllAnswerRows); the ratios are
    computed by getPercentCorrectPerColumn.
    """
    return getPercentCorrectPerColumn(getAllAnswerRows(questionIndex, choice, _gfDF = _gfDF))
In [ ]:
def getTestAnswers( _gfDF, _rmDF, _rmTestDF = normalizedRMDFTest, includeAndroid = True):
    """Return the rows of _gfDF whose user id is one of the values of testUsers.

    _rmDF, _rmTestDF and includeAndroid are accepted but not used by this
    implementation.
    """
    testUserIds = testUsers.values.flatten()
    answeredByTestUser = _gfDF[localplayerguidkey].isin(testUserIds)
    return _gfDF[answeredByTestUser]
In [ ]:
# ambiguous answer to QPlayed
AUnclassifiable = 'I played recently on an other computer'
# fill posttests with pretest data
def setPosttestsProfileInfo(_gfDF):
    """Copy the profile answers of source surveys onto the same user's posttests.

    _gfDF is modified in place. Source surveys are the rows whose QTemporality
    is answerTemporalities[0], plus the rows whose QTemporality is
    answerTemporalities[1] and whose QPlayed answer is AUnclassifiable. The
    copied columns are those flagged in the module-level
    survey1522DF[profileColumn]. Nothing is copied, and a message is printed,
    when QTemporality holds a single distinct value.
    """
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        print("temporalities not set")
    else:
        # progress bar advanced once per row
        intProgress = IntProgress(min=0, max=len(_gfDF.index))
        display(intProgress)
        #_gfDF[_gfDF[QTemporality] == answerTemporalities[1]][QAge]
        for _index in _gfDF.index:
            intProgress.value += 1
            # is this row a source of profile information?
            if ((_gfDF.loc[_index, QTemporality] == answerTemporalities[0])
                or
                (_gfDF.loc[_index, QTemporality] == answerTemporalities[1]
                    and
                    _gfDF.loc[_index, QPlayed] == AUnclassifiable
                )
                ):
                # an incomplete profile is reported and not propagated
                if pd.isnull(_gfDF.loc[_index, survey1522DF[profileColumn]]).any():
                    print("nan for index " + str(_index))
                else:
                    # fix on age loading
                    _gfDF.loc[_index, QAge] = int(_gfDF.loc[_index, QAge])
                    # rows of the same user whose temporality is answerTemporalities[1]
                    thisUserIdsPostests = _gfDF.loc[
                        (_gfDF['userId'] == _gfDF.loc[_index, 'userId'])
                        &
                        (_gfDF[QTemporality] == answerTemporalities[1])
                        ]
                    if(len(thisUserIdsPostests) > 0):
                        # overwrite the profile columns of those rows with this row's values
                        _gfDF.loc[
                            (_gfDF['userId'] == _gfDF.loc[_index, 'userId'])
                            &
                            (_gfDF[QTemporality] == answerTemporalities[1])
                            ,survey1522DF[profileColumn]] = _gfDF.loc[_index, survey1522DF[profileColumn]].values
        # NOTE(review): the source lost its indentation; these closing
        # statements are assumed to belong to the 'else' branch, the only
        # place where intProgress exists - confirm
        intProgress.close()
        del intProgress
        print("profile info set")
In [ ]:
# names of the boolean columns of the question-type table built by getQuestionTypes()
lastAddedColumn = 'lastAdded'
profileColumn = 'profile'
commonColumn = 'common'
compulsoryPretestColumn = 'compulsoryPretest'
optionalPretestColumn = 'optionalPretest'
compulsoryPosttestColumn = 'compulsoryPosttest'
#QVolunteer
# question that getQuestionTypes() checks for an answer on non-pretest surveys
QContent = QBioBricksDevicesComposition
#QRemarks
def getQuestionTypes():
    """Classify every column (question) of gform by the surveys that answer it.

    Returns a DataFrame indexed by gform.columns with boolean columns:
        compulsoryPretestColumn: answered in at least one pretest
            (QTemporality == answerTemporalities[0]) of a user whose
            QVolunteer answer is not in yesNoPositives.
        optionalPretestColumn: answered in at least one pretest of a
            volunteer, but in no pretest of a non-volunteer.
        compulsoryPosttestColumn: answered in at least one other survey whose
            QPlayed answer differs from APlayedButProfileAgain.
        commonColumn: both compulsoryPretest and compulsoryPosttest.
        lastAddedColumn: question of one of the three groups above that is
            left unanswered by at least one survey of that group (for
            non-pretests, only surveys that answered QContent are examined).
        profileColumn: compulsoryPretest and not compulsoryPosttest.
    The QRemarks row is forced to False.

    Side effect: displays an IntProgress bar while the two passes over gform
    are running.
    """
    intProgress = IntProgress(min=0, max=2*len(gform.index))
    display(intProgress)
    survey1522DF = pd.DataFrame(index = gform.columns, data = False,
        columns = [lastAddedColumn, commonColumn, compulsoryPretestColumn,compulsoryPosttestColumn])
    pretestQuestions = pd.Index([])
    pretestNotVolunteeredQuestions = pd.Index([])
    posttestQuestions = pd.Index([])
    lastAddedQuestions = pd.Index([])

    # first pass: collect, per kind of survey, the questions answered at least once
    for answerIndex in gform.index:
        intProgress.value += 1
        # answerIndex is an index label: fetch the row by label, like every
        # other access of this loop, so that a gform whose index is not
        # 0..n-1 still yields the matching row
        answer = gform.loc[answerIndex,:]
        if gform.loc[answerIndex, QTemporality] == answerTemporalities[0]:
            # has volunteered?
            if gform.loc[answerIndex, QVolunteer] in yesNoPositives:
                pretestQuestions = pretestQuestions.union(answer[pd.notnull(answer[:])].index)
            else:
                pretestNotVolunteeredQuestions = pretestNotVolunteeredQuestions.union(answer[pd.notnull(answer[:])].index)
        elif gform.loc[answerIndex, QPlayed] != APlayedButProfileAgain:
            posttestQuestions = posttestQuestions.union(answer[pd.notnull(answer[:])].index)

    survey1522DF[compulsoryPretestColumn] = survey1522DF.index.isin(pretestNotVolunteeredQuestions)
    survey1522DF[optionalPretestColumn] = survey1522DF.index.isin(pretestQuestions.difference(pretestNotVolunteeredQuestions))
    survey1522DF[compulsoryPosttestColumn] = survey1522DF.index.isin(posttestQuestions)
    survey1522DF[commonColumn] = (survey1522DF[compulsoryPretestColumn] & survey1522DF[compulsoryPosttestColumn])

    # second pass: collect the questions of each group that some survey of
    # that group left unanswered
    for answerIndex in gform.index:
        intProgress.value += 1
        # same label-based row access as in the first pass
        answer = gform.loc[answerIndex,:]
        if gform.loc[answerIndex, QTemporality] == answerTemporalities[0]:
            # has volunteered?
            if gform.loc[answerIndex, QVolunteer] in yesNoPositives:
                lastAddedQuestions = lastAddedQuestions.union(answer[pretestQuestions][pd.isnull(answer[pretestQuestions])].index)
            else:
                lastAddedQuestions = lastAddedQuestions.union(answer[pretestNotVolunteeredQuestions][pd.isnull(answer[pretestNotVolunteeredQuestions])].index)
        elif not pd.isnull(gform.loc[answerIndex, QContent]):
            lastAddedQuestions = lastAddedQuestions.union(answer[posttestQuestions][pd.isnull(answer[posttestQuestions])].index)

    survey1522DF[lastAddedColumn] = survey1522DF.index.isin(lastAddedQuestions)
    # manual override
    survey1522DF.loc[QRemarks] = False
    # computed after the override, so QRemarks is not a profile question either
    survey1522DF[profileColumn] = survey1522DF[compulsoryPretestColumn] & (~survey1522DF[compulsoryPosttestColumn])
    intProgress.close()
    del intProgress
    return survey1522DF
In [ ]:
def getPosttestsWithoutPretests(_gfDF):
    """Return the index labels of the posttests whose user has no pretest.

    Pretests and posttests are the rows whose QTemporality equals
    answerTemporalities[0] and answerTemporalities[1] respectively; users are
    identified by the 'userId' column.
    """
    temporalities = _gfDF[QTemporality]
    pretestUserIds = _gfDF[temporalities == answerTemporalities[0]]['userId']
    posttestUserIds = _gfDF[temporalities == answerTemporalities[1]]['userId']
    hasPretest = posttestUserIds.isin(pretestUserIds)
    return posttestUserIds[~hasPretest].index
def getPretestsWithoutPosttests(_gfDF):
    """Return the index labels of the pretests whose user has no posttest.

    Pretests and posttests are the rows whose QTemporality equals
    answerTemporalities[0] and answerTemporalities[1] respectively; users are
    identified by the 'userId' column.
    """
    temporalities = _gfDF[QTemporality]
    pretestUserIds = _gfDF[temporalities == answerTemporalities[0]]['userId']
    posttestUserIds = _gfDF[temporalities == answerTemporalities[1]]['userId']
    hasPosttest = pretestUserIds.isin(posttestUserIds)
    return pretestUserIds[~hasPosttest].index
In [ ]:
def getWithoutIncompleteAnswers(_gfDF):
    """Return a copy of _gfDF without the surveys whose profile is incomplete.

    A survey is dropped when any of its profile questions (the columns
    flagged in survey1522DF[profileColumn]) is null; the posttests left
    without a pretest are dropped afterwards.
    """
    # remove incomplete profiles
    # coincidentally removes posttests that don't have matching pretests
    profileQuestions = _gfDF.columns[survey1522DF[profileColumn]]
    hasMissingProfileAnswer = pd.isnull(_gfDF[profileQuestions]).any(axis = 1)
    completeProfiles = _gfDF.drop(_gfDF.index[hasMissingProfileAnswer])
    # defensive check
    orphanPosttests = getPosttestsWithoutPretests(completeProfiles)
    return completeProfiles.drop(orphanPosttests)
In [ ]:
def getPerfectPretestPostestPairsCount(_gfDF):
    """Return the number of distinct users among the perfect pretest/posttest pairs.

    The pairs come from getPerfectPretestPostestPairs(_gfDF). A warning is
    printed when half the number of returned rows differs from the number of
    distinct 'userId' values; the latter is returned in every case.
    """
    pairedSurveys = getPerfectPretestPostestPairs(_gfDF)
    uniqueUserIdsCount = len(pairedSurveys['userId'].unique())
    halfPairsCount = len(pairedSurveys)//2
    if (halfPairsCount == uniqueUserIdsCount):
        return uniqueUserIdsCount
    print('warning: halfPairsCount ('+str(halfPairsCount)+') != uniqueUserIdsCount ('+str(uniqueUserIdsCount)+')')
    return uniqueUserIdsCount
In [ ]:
# notebook initialization of the loaded gform table; the temporality helpers
# are defined elsewhere (presumably they clear, then fill, the QTemporality
# column - TODO confirm)
resetTemporalities(gform)
#setAnswerTemporalities()
#setAnswerTemporalities2()
setAnswerTemporalitiesSimple(gform)
# classify the gform columns; must run before setPosttestsProfileInfo, which
# reads the module-level survey1522DF
survey1522DF = getQuestionTypes()
setPosttestsProfileInfo(gform)